@@ -92,7 +92,7 @@ gem 'devise','~> 4.2.0' |
||
92 | 92 |
gem 'em-http-request', '~> 1.1.2' |
93 | 93 |
gem 'faraday', '~> 0.9.0' |
94 | 94 |
gem 'faraday_middleware', github: 'lostisland/faraday_middleware', branch: 'master' # '>= 0.10.1' |
95 |
-gem 'feed-normalizer' |
|
95 |
+gem 'feedjira', '~> 2.0' |
|
96 | 96 |
gem 'font-awesome-sass', '~> 4.3.2' |
97 | 97 |
gem 'foreman', '~> 0.63.0' |
98 | 98 |
gem 'geokit', '~> 1.8.4' |
@@ -106,6 +106,7 @@ gem 'jsonpathv2', '~> 0.0.8' |
||
106 | 106 |
gem 'kaminari', github: "amatsuda/kaminari", branch: '0-17-stable' |
107 | 107 |
gem 'kramdown', '~> 1.3.3' |
108 | 108 |
gem 'liquid', '~> 3.0.3' |
109 |
+gem 'loofah', '~> 2.0' |
|
109 | 110 |
gem 'mini_magick' |
110 | 111 |
gem 'multi_xml' |
111 | 112 |
gem 'nokogiri', '1.6.8' |
@@ -214,9 +214,11 @@ GEM |
||
214 | 214 |
extlib (0.9.16) |
215 | 215 |
faraday (0.9.1) |
216 | 216 |
multipart-post (>= 1.2, < 3) |
217 |
- feed-normalizer (1.5.2) |
|
218 |
- hpricot (>= 0.6) |
|
219 |
- simple-rss (>= 1.1) |
|
217 |
+ feedjira (2.0.0) |
|
218 |
+ faraday (~> 0.9) |
|
219 |
+ faraday_middleware (~> 0.9) |
|
220 |
+ loofah (~> 2.0) |
|
221 |
+ sax-machine (~> 1.0) |
|
220 | 222 |
ffi (1.9.10) |
221 | 223 |
font-awesome-sass (4.3.2.1) |
222 | 224 |
sass (~> 3.2) |
@@ -269,7 +271,6 @@ GEM |
||
269 | 271 |
haversine (0.3.0) |
270 | 272 |
hipchat (1.2.0) |
271 | 273 |
httparty |
272 |
- hpricot (0.8.6) |
|
273 | 274 |
httmultiparty (0.3.16) |
274 | 275 |
httparty (>= 0.7.3) |
275 | 276 |
mimemagic |
@@ -486,6 +487,7 @@ GEM |
||
486 | 487 |
sprockets (>= 2.8, < 4.0) |
487 | 488 |
sprockets-rails (>= 2.0, < 4.0) |
488 | 489 |
tilt (>= 1.1, < 3) |
490 |
+ sax-machine (1.3.2) |
|
489 | 491 |
select2-rails (3.5.9.3) |
490 | 492 |
thor (~> 0.14) |
491 | 493 |
shellany (0.0.1) |
@@ -496,7 +498,6 @@ GEM |
||
496 | 498 |
faraday (>= 0.9.0.rc5) |
497 | 499 |
jwt (>= 0.1.5) |
498 | 500 |
multi_json (>= 1.0.0) |
499 |
- simple-rss (1.3.1) |
|
500 | 501 |
simple_oauth (0.3.1) |
501 | 502 |
simplecov (0.9.2) |
502 | 503 |
docile (~> 1.1.0) |
@@ -619,7 +620,7 @@ DEPENDENCIES |
||
619 | 620 |
evernote_oauth |
620 | 621 |
faraday (~> 0.9.0) |
621 | 622 |
faraday_middleware! |
622 |
- feed-normalizer |
|
623 |
+ feedjira (~> 2.0) |
|
623 | 624 |
ffi (>= 1.9.4) |
624 | 625 |
font-awesome-sass (~> 4.3.2) |
625 | 626 |
forecast_io (~> 2.0.0) |
@@ -644,6 +645,7 @@ DEPENDENCIES |
||
644 | 645 |
letter_opener_web (~> 1.3.0) |
645 | 646 |
liquid (~> 3.0.3) |
646 | 647 |
listen (~> 3.0.5) |
648 |
+ loofah (~> 2.0) |
|
647 | 649 |
mini_magick |
648 | 650 |
mqtt |
649 | 651 |
multi_xml |
@@ -1,6 +1,3 @@ |
||
1 |
-require 'rss' |
|
2 |
-require 'feed-normalizer' |
|
3 |
- |
|
4 | 1 |
module Agents |
5 | 2 |
class RssAgent < Agent |
6 | 3 |
include WebRequestConcern |
@@ -9,21 +6,23 @@ module Agents |
||
9 | 6 |
can_dry_run! |
10 | 7 |
default_schedule "every_1d" |
11 | 8 |
|
9 |
+ gem_dependency_check { defined?(Feedjira::Feed) } |
|
10 |
+ |
|
12 | 11 |
DEFAULT_EVENTS_ORDER = [['{{date_published}}', 'time'], ['{{last_updated}}', 'time']] |
13 | 12 |
|
14 | 13 |
description do |
15 | 14 |
<<-MD |
16 | 15 |
The RSS Agent consumes RSS feeds and emits events when they change. |
17 | 16 |
|
18 |
- This Agent is fairly simple, using [feed-normalizer](https://github.com/aasmith/feed-normalizer) as a base. For complex feeds |
|
19 |
- with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers). |
|
17 |
+ This agent, using [Feedjira](https://github.com/feedjira/feedjira) as a base, can parse various types of RSS and Atom feeds and has some special handlers for FeedBurner, iTunes RSS, and so on. However, supported fields are limited by its general and abstract nature. For complex feeds with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers). |
|
20 | 18 |
|
21 | 19 |
If you want to *output* an RSS feed, use the DataOutputAgent. |
22 | 20 |
|
23 | 21 |
Options: |
24 | 22 |
|
25 | 23 |
* `url` - The URL of the RSS feed (an array of URLs can also be used; items with identical guids across feeds will be considered duplicates). |
26 |
- * `clean` - Attempt to use [feed-normalizer](https://github.com/aasmith/feed-normalizer)'s' `clean!` method to cleanup HTML in the feed. Set to `true` to use. |
|
24 |
+ * `include_feed_info` - Set to `true` to include feed information in each event. |
|
25 |
+ * `clean` - Set to `true` to sanitize `description` and `content` as HTML fragments, removing unknown/unsafe elements and attributes. |
|
27 | 26 |
* `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working. |
28 | 27 |
* `headers` - When present, it should be a hash of headers to send with the request. |
29 | 28 |
* `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`. |
@@ -53,18 +52,46 @@ module Agents |
||
53 | 52 |
Events look like: |
54 | 53 |
|
55 | 54 |
{ |
55 |
+ "feed": { |
|
56 |
+ "id": "...", |
|
57 |
+ "type": "atom", |
|
58 |
+ "generator": "...", |
|
59 |
+ "url": "http://example.com/", |
|
60 |
+ "links": [ |
|
61 |
+ { "href": "http://example.com/", "rel": "alternate", "type": "text/html" }, |
|
62 |
+ { "href": "http://example.com/index.atom", "rel": "self", "type": "application/atom+xml" } |
|
63 |
+ ], |
|
64 |
+ "title": "Some site title", |
|
65 |
+ "description": "Some site description", |
|
66 |
+ "copyright": "...", |
|
67 |
+ "icon": "http://example.com/icon.png", |
|
68 |
+ "authors": [ "..." ], |
|
69 |
+ "date_published": "2014-09-11T01:30:00-07:00", |
|
70 |
+ "last_updated": "2014-09-11T01:30:00-07:00" |
|
71 |
+ }, |
|
56 | 72 |
"id": "829f845279611d7925146725317b868d", |
57 |
- "date_published": "2014-09-11 01:30:00 -0700", |
|
58 |
- "last_updated": "Thu, 11 Sep 2014 01:30:00 -0700", |
|
59 | 73 |
"url": "http://example.com/...", |
60 | 74 |
"urls": [ "http://example.com/..." ], |
75 |
+ "links": [ |
|
76 |
+ { "href": "http://example.com/...", "rel": "alternate" } |
|
77 |
+ ], |
|
78 |
+ "title": "Some title", |
|
61 | 79 |
"description": "Some description", |
62 | 80 |
"content": "Some content", |
63 |
- "title": "Some title", |
|
64 |
- "authors": [ ... ], |
|
65 |
- "categories": [ ... ] |
|
81 |
+ "authors": [ "Some Author <email@address>" ], |
|
82 |
+ "categories": [ "..." ], |
|
83 |
+ "enclosure": { |
|
84 |
+ "url": "http://example.com/file.mp3", "type": "audio/mpeg", "length": "123456789" |
|
85 |
+ }, |
|
86 |
+ "date_published": "2014-09-11T01:30:00-07:00", |
|
87 |
+ "last_updated": "2014-09-11T01:30:00-07:00" |
|
66 | 88 |
} |
67 | 89 |
|
90 |
+ Some notes: |
|
91 |
+ |
|
92 |
+ - The `feed` key is present only if `include_feed_info` is set to true. |
|
93 |
+ - Each element in `authors` is a string normalized in the format "*name* <*email*> (*url*)", where each space-separated part is optional. |
|
94 |
+ - Timestamps are converted to the ISO 8601 format. |
|
68 | 95 |
MD |
69 | 96 |
|
70 | 97 |
def working? |
@@ -104,8 +131,7 @@ module Agents |
||
104 | 131 |
begin |
105 | 132 |
response = faraday.get(url) |
106 | 133 |
if response.success? |
107 |
- feed = FeedNormalizer::FeedNormalizer.parse(response.body, loose: true) |
|
108 |
- feed.clean! if boolify(interpolated['clean']) |
|
134 |
+ feed = Feedjira::Feed.parse(response.body) |
|
109 | 135 |
new_events.concat feed_to_events(feed) |
110 | 136 |
else |
111 | 137 |
error "Failed to fetch #{url}: #{response.inspect}" |
@@ -128,10 +154,6 @@ module Agents |
||
128 | 154 |
log "Fetched #{urls.to_sentence} and created #{created_event_count} event(s)." |
129 | 155 |
end |
130 | 156 |
|
131 |
- def get_entry_id(entry) |
|
132 |
- entry.id.presence || Digest::MD5.hexdigest(entry.content) |
|
133 |
- end |
|
134 |
- |
|
135 | 157 |
def check_and_track(entry_id) |
136 | 158 |
memory['seen_ids'] ||= [] |
137 | 159 |
if memory['seen_ids'].include?(entry_id) |
@@ -143,21 +165,71 @@ module Agents |
||
143 | 165 |
end |
144 | 166 |
end |
145 | 167 |
|
168 |
+ unless dependencies_missing? |
|
169 |
+ require 'feedjira_extension' |
|
170 |
+ end |
|
171 |
+ |
|
172 |
+ def feed_data(feed) |
|
173 |
+ type = |
|
174 |
+ case feed.class.name |
|
175 |
+ when /Atom/ |
|
176 |
+ 'atom' |
|
177 |
+ else |
|
178 |
+ 'rss' |
|
179 |
+ end |
|
180 |
+ |
|
181 |
+ { |
|
182 |
+ id: feed.feed_id, |
|
183 |
+ type: type, |
|
184 |
+ url: feed.url, |
|
185 |
+ links: feed.links, |
|
186 |
+ title: feed.title, |
|
187 |
+ description: feed.description, |
|
188 |
+ copyright: feed.copyright, |
|
189 |
+ generator: feed.generator, |
|
190 |
+ icon: feed.icon, |
|
191 |
+ authors: feed.authors, |
|
192 |
+ date_published: feed.date_published, |
|
193 |
+ last_updated: feed.last_updated, |
|
194 |
+ } |
|
195 |
+ end |
|
196 |
+ |
|
197 |
+ def entry_data(entry) |
|
198 |
+ { |
|
199 |
+ id: entry.id, |
|
200 |
+ url: entry.url, |
|
201 |
+ urls: entry.links.map(&:href), |
|
202 |
+ links: entry.links, |
|
203 |
+ title: entry.title, |
|
204 |
+ description: clean_fragment(entry.summary), |
|
205 |
+ content: clean_fragment(entry.content || entry.summary), |
|
206 |
+ image: entry.try(:image), |
|
207 |
+ enclosure: entry.enclosure, |
|
208 |
+ authors: entry.authors, |
|
209 |
+ categories: Array(entry.try(:categories)), |
|
210 |
+ date_published: entry.date_published, |
|
211 |
+ last_updated: entry.last_updated, |
|
212 |
+ } |
|
213 |
+ end |
|
214 |
+ |
|
146 | 215 |
def feed_to_events(feed) |
216 |
+ payload_base = {} |
|
217 |
+ |
|
218 |
+ if boolify(interpolated['include_feed_info']) |
|
219 |
+ payload_base[:feed] = feed_data(feed) |
|
220 |
+ end |
|
221 |
+ |
|
147 | 222 |
feed.entries.map { |entry| |
148 |
- Event.new(payload: { |
|
149 |
- id: get_entry_id(entry), |
|
150 |
- date_published: entry.date_published, |
|
151 |
- last_updated: entry.last_updated, |
|
152 |
- url: entry.url, |
|
153 |
- urls: entry.urls, |
|
154 |
- description: entry.description, |
|
155 |
- content: entry.content, |
|
156 |
- title: entry.title, |
|
157 |
- authors: entry.authors, |
|
158 |
- categories: entry.categories |
|
159 |
- }) |
|
223 |
+ Event.new(payload: payload_base.merge(entry_data(entry))) |
|
160 | 224 |
} |
161 | 225 |
end |
226 |
+ |
|
227 |
+ def clean_fragment(fragment) |
|
228 |
+ if boolify(interpolated['clean']) && fragment.present? |
|
229 |
+ Loofah.scrub_fragment(fragment, :prune).to_s |
|
230 |
+ else |
|
231 |
+ fragment |
|
232 |
+ end |
|
233 |
+ end |
|
162 | 234 |
end |
163 | 235 |
end |
@@ -0,0 +1,286 @@ |
||
require 'feedjira'
require 'digest'
require 'mail'
require 'date'

# Extensions mixed into every class listed in Feedjira::Feed.feed_classes
# (see the bottom of this file) and, through them, into the entry classes
# those feed parsers declare for their `entries` collection.
#
# They add normalized accessors (`authors`, `links`, `enclosure`, `image`,
# `date_published`, `last_updated`, `copyright`, ...) on top of the SAXMachine
# element declarations of each parser class.
module FeedjiraExtension
  AUTHOR_ATTRS = %i[name email uri].freeze
  LINK_ATTRS = %i[href rel type hreflang title length].freeze
  ENCLOSURE_ATTRS = %i[url type length].freeze

  # An author with optional name, email and uri.
  class Author < Struct.new(*AUTHOR_ATTRS)
    # Serializes the author as a single JSON string in the form
    # "name <email> (uri)"; blank parts are left out.
    def to_json(options = nil)
      members.flat_map { |key|
        value = self[key].presence
        next [] unless value

        case key
        when :email
          "<#{value}>"
        when :uri
          "(#{value})"
        else
          value
        end
      }.join(' ').to_json(options)
    end
  end

  # Author parsed from an Atom <author> element with name/email/uri children.
  class AtomAuthor < Author
    include SAXMachine

    AUTHOR_ATTRS.each do |attr|
      element attr
    end
  end

  # Author parsed from the text content of an RSS author-like element.
  class RssAuthor < Author
    include SAXMachine

    # Receives the element text and splits it into name and email.
    # When Mail::Address cannot parse the text, the whole text is used as the
    # name.
    def content=(content)
      # Store the raw value directly instead of calling `super`: no ancestor
      # visible in this file defines `content=`, so `super` has nothing to
      # dispatch to here.
      @content = content

      begin
        addr = Mail::Address.new(content)
      rescue StandardError
        self.name = content
      else
        self.name = addr.name
        self.email = addr.address
      end
    end

    value :content
  end

  # An <enclosure> element, described by its url/type/length attributes.
  class Enclosure
    include SAXMachine

    ENCLOSURE_ATTRS.each do |attr|
      attribute attr
    end

    # Serializes as a JSON object holding only the attributes that are set.
    def to_json(options = nil)
      ENCLOSURE_ATTRS.each_with_object({}) { |key, hash|
        value = __send__(key)
        hash[key] = value if value
      }.to_json(options)
    end
  end

  # An Atom-style <link>, described by its attributes (see LINK_ATTRS).
  class AtomLink
    include SAXMachine

    LINK_ATTRS.each do |attr|
      attribute attr
    end

    # Serializes as a JSON object holding only the attributes that are set.
    def to_json(options = nil)
      LINK_ATTRS.each_with_object({}) { |key, hash|
        value = __send__(key)
        hash[key] = value if value
      }.to_json(options)
    end
  end

  # An RSS-style <link>, whose text content is the target URL.
  class RssLinkElement
    include SAXMachine

    value :href

    # Serializes as a JSON object with a single `href` key.
    def to_json(options = nil)
      {
        href: href
      }.to_json(options)
    end
  end

  # Declares an `authors` collection on the including parser class.
  #
  # Link-related helpers (`alternate_link`, `url`) are intentionally not
  # defined here: they depend on `links`, which only HasLinks provides, and
  # HasLinks defines them itself.
  module HasAuthors
    def self.included(mod)
      mod.module_exec do
        case name
        when /RSS/
          %w[
            itunes:author
            dc:creator
            author
            managingEditor
          ].each do |tag|
            # Drop any single-valued declaration of this element so that
            # every occurrence is collected into `authors` instead.
            sax_config.top_level_elements[tag].clear

            elements tag, class: RssAuthor, as: :authors
          end
        else
          elements :author, class: AtomAuthor, as: :authors
        end
      end
    end
  end

  # Declares a structured `enclosure` on the including parser class and
  # derives `image` from it when the enclosure is an image.
  module HasEnclosure
    def self.included(mod)
      mod.module_exec do
        sax_config.top_level_elements['enclosure'].clear

        element :enclosure, class: Enclosure

        # Returns the enclosure only if its type starts with "image/".
        def image_enclosure
          case enclosure.try!(:type)
          when %r{\Aimage/}
            enclosure
          end
        end

        # URL of the image enclosure, unless `@image` is already set.
        def image
          @image ||= image_enclosure.try!(:url)
        end
      end
    end
  end

  # Declares a `links` collection on the including parser class and derives
  # `url` from it.
  module HasLinks
    def self.included(mod)
      mod.module_exec do
        sax_config.top_level_elements['link'].clear
        sax_config.collection_elements['link'].clear

        case name
        when /RSS/
          elements :link, class: RssLinkElement, as: :rss_links

          case name
          when /FeedBurner/
            # FeedBurner RSS embeds Atom links under the `atom10` prefix.
            elements :'atom10:link', class: AtomLink, as: :atom_links

            def links
              @links ||= [*rss_links, *atom_links]
            end
          else
            alias_method :links, :rss_links
          end
        else
          elements :link, class: AtomLink, as: :links
        end

        # First Atom link with rel="alternate" whose type is text/html or
        # unspecified.
        def alternate_link
          links.find { |link|
            link.is_a?(AtomLink) &&
              link.rel == 'alternate' &&
              (link.type == 'text/html' || link.type.nil?)
          }
        end

        # Keeps an already assigned `@url`; otherwise falls back to the
        # alternate link, then to the first link.
        def url
          @url ||= (alternate_link || links.first).try!(:href)
        end
      end
    end
  end

  # Timestamp handling shared by feeds and entries.
  module HasTimestamps
    attr_reader :published, :updated

    # Keep the "oldest" publish time found.
    # Values that cannot be parsed are ignored, so they never clobber or get
    # compared against a previously stored time.
    def published=(value)
      parsed = parse_datetime(value)
      return if parsed.nil?

      @published = parsed if @published.nil? || parsed < @published
    end

    # Keep the most recent update time found.
    # Values that cannot be parsed are ignored (see #published=).
    def updated=(value)
      parsed = parse_datetime(value)
      return if parsed.nil?

      @updated = parsed if @updated.nil? || parsed > @updated
    end

    # Publish time in ISO 8601 format, or nil when none was captured.
    def date_published
      published.try(:iso8601)
    end

    # Update time in ISO 8601 format, falling back to the publish time.
    def last_updated
      (updated || published).try(:iso8601)
    end

    private

    # Best-effort parse: returns nil instead of raising on bad input.
    def parse_datetime(string)
      DateTime.parse(string)
    rescue StandardError
      nil
    end
  end

  # Mixed into feed entry classes.
  module FeedEntryExtensions
    def self.included(mod)
      mod.module_exec do
        include HasAuthors
        include HasEnclosure
        include HasLinks
        include HasTimestamps
      end
    end

    # The entry's own id when present and non-blank; otherwise an MD5 digest
    # of its content (or summary, or the empty string).
    def id
      entry_id.presence || Digest::MD5.hexdigest(content || summary || '')
    end
  end

  # Mixed into feed parser classes.
  module FeedExtensions
    def self.included(mod)
      mod.module_exec do
        include HasAuthors
        include HasEnclosure
        include HasLinks
        include HasTimestamps

        element :id, as: :feed_id
        element :generator
        elements :rights
        element :published
        element :updated
        element :icon

        if /RSS/ === name
          element :guid, as: :feed_id
          element :copyright
          element :pubDate, as: :published
          element :'dc:date', as: :published
          element :lastBuildDate, as: :updated
          element :image, value: :url, as: :icon

          # Prefer an explicit <copyright>; otherwise fall back to the
          # <rights>-based FeedExtensions#copyright.
          def copyright
            @copyright || super
          end
        end

        # Extend every entry class this parser declares for `entries`.
        sax_config.collection_elements.each_value do |collection_elements|
          collection_elements.each do |collection_element|
            next unless collection_element.accessor == 'entries'

            entry_class = collection_element.data_class
            next unless entry_class.is_a?(Class)

            entry_class.send :include, FeedEntryExtensions
          end
        end
      end
    end

    # All <rights> values joined by newlines, or nil when that is blank.
    def copyright
      rights.join("\n").presence
    end
  end

  Feedjira::Feed.feed_classes.each do |feed_class|
    feed_class.send :include, FeedExtensions
  end
end
@@ -44,6 +44,7 @@ |
||
44 | 44 |
<category>calendar</category> |
45 | 45 |
<category>menubar</category> |
46 | 46 |
<category>osx</category> |
47 |
+ <enclosure url="http://c.1tw.org/images/2015/itsy.png" length="48249" type="image/png" /> |
|
47 | 48 |
</item> |
48 | 49 |
<item> |
49 | 50 |
<title>Magic Wormhole</title> |
@@ -208,8 +209,7 @@ |
||
208 | 209 |
</item> |
209 | 210 |
<item> |
210 | 211 |
<title>Showgoers</title> |
211 |
- <description><a href="http://showgoers.tv/">Showgoers</a>: <blockquote> <p>Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.</p> </blockquote> |
|
212 |
- </description> |
|
212 |
+ <description><a href="http://showgoers.tv/" onmouseover="javascript:void(0)">Showgoers</a>: <blockquote> <p>Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.</p> </blockquote><script>some code</script></description> |
|
213 | 213 |
<link>http://onethingwell.org/post/125509667816</link> |
214 | 214 |
<guid>http://onethingwell.org/post/125509667816</guid> |
215 | 215 |
<pubDate>Fri, 31 Jul 2015 13:00:13 +0100</pubDate> |
@@ -55,16 +55,57 @@ describe Agents::RssAgent do |
||
55 | 55 |
end |
56 | 56 |
|
57 | 57 |
describe "emitting RSS events" do |
58 |
- it "should emit items as events" do |
|
58 |
+ it "should emit items as events for an Atom feed" do |
|
59 |
+ agent.options['include_feed_info'] = true |
|
60 |
+ |
|
59 | 61 |
expect { |
60 | 62 |
agent.check |
61 | 63 |
}.to change { agent.events.count }.by(20) |
62 | 64 |
|
63 | 65 |
first, *, last = agent.events.last(20) |
66 |
+ [first, last].each do |event| |
|
67 |
+ expect(first.payload['feed']).to include({ |
|
68 |
+ "type" => "atom", |
|
69 |
+ "title" => "Recent Commits to huginn:master", |
|
70 |
+ "url" => "https://github.com/cantino/huginn/commits/master", |
|
71 |
+ "links" => [ |
|
72 |
+ { |
|
73 |
+ "type" => "text/html", |
|
74 |
+ "rel" => "alternate", |
|
75 |
+ "href" => "https://github.com/cantino/huginn/commits/master", |
|
76 |
+ }, |
|
77 |
+ { |
|
78 |
+ "type" => "application/atom+xml", |
|
79 |
+ "rel" => "self", |
|
80 |
+ "href" => "https://github.com/cantino/huginn/commits/master.atom", |
|
81 |
+ }, |
|
82 |
+ ], |
|
83 |
+ }) |
|
84 |
+ end |
|
64 | 85 |
expect(first.payload['url']).to eq("https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0") |
65 | 86 |
expect(first.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0"]) |
87 |
+ expect(first.payload['links']).to eq([ |
|
88 |
+ { |
|
89 |
+ "href" => "https://github.com/cantino/huginn/commit/d0a844662846cf3c83b94c637c1803f03db5a5b0", |
|
90 |
+ "rel" => "alternate", |
|
91 |
+ "type" => "text/html", |
|
92 |
+ } |
|
93 |
+ ]) |
|
94 |
+ expect(first.payload['authors']).to eq(["cantino (https://github.com/cantino)"]) |
|
95 |
+ expect(first.payload['date_published']).to be_nil |
|
96 |
+ expect(first.payload['last_updated']).to eq("2014-07-16T22:26:22-07:00") |
|
66 | 97 |
expect(last.payload['url']).to eq("https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af") |
67 | 98 |
expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af"]) |
99 |
+ expect(last.payload['links']).to eq([ |
|
100 |
+ { |
|
101 |
+ "href" => "https://github.com/cantino/huginn/commit/d465158f77dcd9078697e6167b50abbfdfa8b1af", |
|
102 |
+ "rel" => "alternate", |
|
103 |
+ "type" => "text/html", |
|
104 |
+ } |
|
105 |
+ ]) |
|
106 |
+ expect(last.payload['authors']).to eq(["CloCkWeRX (https://github.com/CloCkWeRX)"]) |
|
107 |
+ expect(last.payload['date_published']).to be_nil |
|
108 |
+ expect(last.payload['last_updated']).to eq("2014-07-01T16:37:47+09:30") |
|
68 | 109 |
end |
69 | 110 |
|
70 | 111 |
it "should emit items as events in the order specified in the events_order option" do |
@@ -82,6 +123,33 @@ describe Agents::RssAgent do |
||
82 | 123 |
expect(last.payload['urls']).to eq(["https://github.com/cantino/huginn/commit/0e80f5341587aace2c023b06eb9265b776ac4535"]) |
83 | 124 |
end |
84 | 125 |
|
126 |
+ it "should emit items as events for a FeedBurner RSS 2.0 feed" do |
|
127 |
+ agent.options['url'] = "http://feeds.feedburner.com/SlickdealsnetFP?format=atom" # This is actually RSS 2.0 w/ Atom extension |
|
128 |
+ agent.options['include_feed_info'] = true |
|
129 |
+ agent.save! |
|
130 |
+ |
|
131 |
+ expect { |
|
132 |
+ agent.check |
|
133 |
+ }.to change { agent.events.count }.by(79) |
|
134 |
+ |
|
135 |
+ first, *, last = agent.events.last(79) |
|
136 |
+ expect(first.payload['feed']).to include({ |
|
137 |
+ "type" => "rss", |
|
138 |
+ "title" => "SlickDeals.net", |
|
139 |
+ "description" => "Slick online shopping deals.", |
|
140 |
+ "url" => "http://slickdeals.net/", |
|
141 |
+ }) |
|
142 |
+ # Feedjira extracts feedburner:origLink |
|
143 |
+ expect(first.payload['url']).to eq("http://slickdeals.net/permadeal/130160/green-man-gaming---pc-games-tomb-raider-game-of-the-year-6-hitman-absolution-elite-edition") |
|
144 |
+ expect(last.payload['feed']).to include({ |
|
145 |
+ "type" => "rss", |
|
146 |
+ "title" => "SlickDeals.net", |
|
147 |
+ "description" => "Slick online shopping deals.", |
|
148 |
+ "url" => "http://slickdeals.net/", |
|
149 |
+ }) |
|
150 |
+ expect(last.payload['url']).to eq("http://slickdeals.net/permadeal/129980/amazon---rearth-ringke-fusion-bumper-hybrid-case-for-iphone-6") |
|
151 |
+ end |
|
152 |
+ |
|
85 | 153 |
it "should track ids and not re-emit the same item when seen again" do |
86 | 154 |
agent.check |
87 | 155 |
expect(agent.memory['seen_ids']).to eq(agent.events.map {|e| e.payload['id'] }) |
@@ -155,17 +223,39 @@ describe Agents::RssAgent do |
||
155 | 223 |
@valid_options['url'] = 'http://onethingwell.org/rss' |
156 | 224 |
end |
157 | 225 |
|
226 |
+ it "captures timestamps normalized in the ISO 8601 format" do |
|
227 |
+ agent.check |
|
228 |
+ first, *, third = agent.events.take(3) |
|
229 |
+ expect(first.payload['date_published']).to eq('2015-08-20T17:00:10+01:00') |
|
230 |
+ expect(third.payload['date_published']).to eq('2015-08-20T13:00:07+01:00') |
|
231 |
+ end |
|
232 |
+ |
|
158 | 233 |
it "captures multiple categories" do |
159 | 234 |
agent.check |
160 | 235 |
first, *, third = agent.events.take(3) |
161 | 236 |
expect(first.payload['categories']).to eq(["csv", "crossplatform", "utilities"]) |
162 | 237 |
expect(third.payload['categories']).to eq(["web"]) |
163 | 238 |
end |
239 |
+ |
|
240 |
+ it "sanitizes HTML content" do |
|
241 |
+ agent.options['clean'] = true |
|
242 |
+ agent.check |
|
243 |
+ event = agent.events.last |
|
244 |
+ expect(event.payload['content']).to eq('<a href="http://showgoers.tv/">Showgoers</a>: <blockquote> <p>Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.</p> </blockquote>') |
|
245 |
+ expect(event.payload['description']).to eq('<a href="http://showgoers.tv/">Showgoers</a>: <blockquote> <p>Showgoers is a Chrome browser extension to synchronize your Netflix player with someone else so that you can co-watch the same movie on different computers with no hassle. Syncing up your player is as easy as sharing a URL.</p> </blockquote>') |
|
246 |
+ end |
|
247 |
+ |
|
248 |
+ it "captures an enclosure" do |
|
249 |
+ agent.check |
|
250 |
+ event = agent.events.fourth |
|
251 |
+ expect(event.payload['enclosure']).to eq({ "url" => "http://c.1tw.org/images/2015/itsy.png", "type" => "image/png", "length" => "48249" }) |
|
252 |
+ expect(event.payload['image']).to eq("http://c.1tw.org/images/2015/itsy.png") |
|
253 |
+ end |
|
164 | 254 |
end |
165 | 255 |
|
166 | 256 |
describe 'logging errors with the feed url' do |
167 | 257 |
it 'includes the feed URL when an exception is raised' do |
168 |
- mock(FeedNormalizer::FeedNormalizer).parse(anything, loose: true) { raise StandardError.new("Some error!") } |
|
258 |
+ mock(Feedjira::Feed).parse(anything) { raise StandardError.new("Some error!") } |
|
169 | 259 |
expect(lambda { |
170 | 260 |
agent.check |
171 | 261 |
}).not_to raise_error |